{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import re"
   ]
  },
  {
   "source": [
    "# Ch07 从文本提取信息\n",
    "\n",
    "学习目标\n",
    "\n",
    "1.  从非结构化文本中提取结构化数据\n",
    "2.  识别一个文本中描述的实体和关系\n",
    "3.  使用语料库来训练和评估模型"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 7.4 语言结构中的递归"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 7.4.1 使用级联分块器构建嵌套的结构"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex7-6 四级分块语法器，处理 NP（名词短语）、PP（介词短语）、VP（动词短语）和 S（句子，即语法中的 CLAUSE）的模式\n",
    "grammar = r'''\n",
    "NP: {<DT|JJ|NN.*>+}             # Chunk sequences of DT, JJ, NN\n",
    "PP: {<IN><NP>}                  # Chunk prepositions followed by NP\n",
    "VP: {<VB.*><NP|PP|CLAUSE>+$}    # Chunk verbs and their arguments\n",
    "CLAUSE: {<NP><VP>}              # Chunk NP, VP\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (NP Mary/NN)\n  saw/VBD\n  (CLAUSE\n    (NP the/DT cat/NN)\n    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))\n"
     ]
    }
   ],
   "source": [
    "sentence = [(\"Mary\", \"NN\"),\n",
    "            (\"saw\", \"VBD\"),\n",
    "            (\"the\", \"DT\"),\n",
    "            (\"cat\", \"NN\"),\n",
    "            (\"sit\", \"VB\"),\n",
    "            (\"on\", \"IN\"),\n",
    "            (\"the\", \"DT\"),\n",
    "            (\"mat\", \"NN\")]\n",
    "cp = nltk.RegexpParser(grammar)\n",
    "print(cp.parse(sentence))            \n",
    "# 未能正确识别以 saw 为首的 VP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (CLAUSE\n    (NP Mary/NN)\n    (VP\n      saw/VBD\n      (CLAUSE\n        (NP the/DT cat/NN)\n        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))\n"
     ]
    }
   ],
   "source": [
    "cp = nltk.RegexpParser(grammar, loop=2)\n",
    "print(cp.parse(sentence))\n",
    "# 增加循环次数，可以正确识别以 saw 为首的 VP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (NP John/NNP)\n  thinks/VBZ\n  (NP Mary/NN)\n  saw/VBD\n  (CLAUSE\n    (NP the/DT cat/NN)\n    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))\n"
     ]
    }
   ],
   "source": [
    "sentence = [(\"John\", \"NNP\"),\n",
    "            (\"thinks\", \"VBZ\"),\n",
    "            (\"Mary\", \"NN\"),\n",
    "            (\"saw\", \"VBD\"),\n",
    "            (\"the\", \"DT\"),\n",
    "            (\"cat\", \"NN\"),\n",
    "            (\"sit\", \"VB\"),\n",
    "            (\"on\", \"IN\"),\n",
    "            (\"the\", \"DT\"),\n",
    "            (\"mat\", \"NN\")]\n",
    "\n",
    "cp = nltk.RegexpParser(grammar)\n",
    "print(cp.parse(sentence))\n",
    "# 未能正确识别 saw 的 VP，也未能正确识别 thinks 的 VP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (NP John/NNP)\n  thinks/VBZ\n  (CLAUSE\n    (NP Mary/NN)\n    (VP\n      saw/VBD\n      (CLAUSE\n        (NP the/DT cat/NN)\n        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))\n"
     ]
    }
   ],
   "source": [
    "cp = nltk.RegexpParser(grammar, loop=2)\n",
    "print(cp.parse(sentence))\n",
    "# 增加循环次数，可以正确识别以 saw 为首的 VP\n",
    "# 未能正确识别 thinks 的 VP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (CLAUSE\n    (NP John/NNP)\n    (VP\n      thinks/VBZ\n      (CLAUSE\n        (NP Mary/NN)\n        (VP\n          saw/VBD\n          (CLAUSE\n            (NP the/DT cat/NN)\n            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))\n"
     ]
    }
   ],
   "source": [
    "cp = nltk.RegexpParser(grammar, loop=3)\n",
    "print(cp.parse(sentence))\n",
    "# 增加循环次数，可以正确识别以 saw 为首的 VP，也可以正确识别 thinks 的 VP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (CLAUSE\n    (NP John/NNP)\n    (VP\n      thinks/VBZ\n      (CLAUSE\n        (NP Mary/NN)\n        (VP\n          saw/VBD\n          (CLAUSE\n            (NP the/DT cat/NN)\n            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))\n"
     ]
    }
   ],
   "source": [
    "cp = nltk.RegexpParser(grammar, loop=4)\n",
    "print(cp.parse(sentence))\n",
    "# 更多的循环次数不会影响结果的正确性，只会增加计算的时间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S\n  (CLAUSE\n    (NP John/NNP)\n    (VP\n      thinks/VBZ\n      (CLAUSE\n        (NP Mary/NN)\n        (VP\n          saw/VBD\n          (CLAUSE\n            (NP the/DT cat/NN)\n            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))\n"
     ]
    }
   ],
   "source": [
    "cp = nltk.RegexpParser(grammar, loop=4)\n",
    "parsed_sent = cp.parse(sentence)\n",
    "print(parsed_sent)\n",
    "parsed_sent.draw()"
   ]
  },
  {
   "source": [
    "虽然级联分块可以创建深层结构，但这类语法的编写与调试非常困难，而且只能产生固定深度的树状图，仍然属于不完整的句法分析；因此，到了一定程度后，进行完整的句法分析会更有效（参见第 8 章）。"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 7.4.2 树状图\n",
    "\n",
    "树状图是一组相互连接的带标签的节点，从一个特殊的根节点出发，沿着唯一的路径可以到达每个节点。"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(NP Alice)\n"
     ]
    }
   ],
   "source": [
    "# 创建树状图\n",
    "tree1 = nltk.Tree('NP', ['Alice'])\n",
    "print(tree1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(NP the rabbit)\n"
     ]
    }
   ],
   "source": [
    "tree2 = nltk.Tree('NP', ['the', 'rabbit'])\n",
    "print(tree2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(VP chased (NP the rabbit))\n"
     ]
    }
   ],
   "source": [
    "# 合并成大的树状图\n",
    "tree3 = nltk.Tree('VP', ['chased', tree2])\n",
    "print(tree3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S (NP Alice) (VP chased (NP the rabbit)))\n"
     ]
    }
   ],
   "source": [
    "tree4 = nltk.Tree('S', [tree1, tree3])\n",
    "print(tree4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tree4=  (S (NP Alice) (VP chased (NP the rabbit)))\ntree4[0]=  (NP Alice)\ntree4[1]=  (VP chased (NP the rabbit))\ntree4[1][1]=  (NP the rabbit)\ntree4[1][1][0]=  the\n"
     ]
    }
   ],
   "source": [
    "# 访问树状图的对象\n",
    "print(\"tree4= \", tree4)\n",
    "print(\"tree4[0]= \", tree4[0])\n",
    "print(\"tree4[1]= \", tree4[1])\n",
    "print(\"tree4[1][1]= \", tree4[1][1])\n",
    "print(\"tree4[1][1][0]= \", tree4[1][1][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tree4=  (S (NP Alice) (VP chased (NP the rabbit)))\ntree4.label()=  S\ntree4.leaves()=  ['Alice', 'chased', 'the', 'rabbit']\n"
     ]
    }
   ],
   "source": [
    "# 调用树状图的函数\n",
    "print(\"tree4= \", tree4)\n",
    "print(\"tree4.label()= \", tree4.label())\n",
    "print(\"tree4.leaves()= \", tree4.leaves())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tree4[1]=  (VP chased (NP the rabbit))\ntree4[1].label()=  VP\ntree4[1].leaves()=  ['chased', 'the', 'rabbit']\n"
     ]
    }
   ],
   "source": [
    "print(\"tree4[1]= \", tree4[1])\n",
    "print(\"tree4[1].label()= \", tree4[1].label())\n",
    "print(\"tree4[1].leaves()= \", tree4[1].leaves())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tree4[1][1]=  (NP the rabbit)\ntree4[1][1].label()=  NP\ntree4[1][1].leaves()=  ['the', 'rabbit']\n"
     ]
    }
   ],
   "source": [
    "print(\"tree4[1][1]= \", tree4[1][1])\n",
    "print(\"tree4[1][1].label()= \", tree4[1][1].label())\n",
    "print(\"tree4[1][1].leaves()= \", tree4[1][1].leaves())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tree4[1][1][0]=  the\n"
     ]
    }
    ],
   "source": [
    "print(\"tree4[1][1][0]= \", tree4[1][1][0])\n",
    "# 叶子节点 'the' 是普通字符串，没有 label() / leaves() 方法；取消下面两行的注释会抛出 AttributeError\n",
    "# print(\"tree4[1][1][0].label()= \", tree4[1][1][0].label())\n",
    "# print(\"tree4[1][1][0].leaves()= \", tree4[1][1][0].leaves())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree4.draw()"
   ]
  },
  {
   "source": [
    "### 7.4.3 树的遍历"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex7-7 使用递归函数遍历树状图\n",
    "def traverse(t):\n",
    "    try:\n",
    "        t.label()\n",
    "    except AttributeError:\n",
    "        print(t, end=' ')\n",
    "    else:\n",
    "        print('(', t.label(), end=' ')\n",
    "        for child in t:\n",
    "            traverse(child)\n",
    "        print(')', end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S (NP Alice) (VP chased (NP the rabbit)))\n( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) "
     ]
    }
    ],
   "source": [
    "# 新版 NLTK 中不能再用 Tree() 构造函数直接从字符串生成树，需要改用 Tree.fromstring()。\n",
    "t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')\n",
    "print(t)\n",
    "traverse(t)\n",
    "t.draw()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S (NP Alice) (VP chased (NP the rabbit)))\n( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) "
     ]
    }
    ],
   "source": [
    "# 同样使用 Tree.fromstring()（新版 NLTK 中 Tree() 不能直接解析字符串），从 tree4 的字符串表示重建一棵树。\n",
    "t = nltk.Tree.fromstring(tree4.__str__())\n",
    "print(t)\n",
    "traverse(t)\n",
    "t.draw()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(S (NP Alice) (VP chased (NP the rabbit)))\n( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) "
     ]
    }
   ],
   "source": [
    "print(tree4)\n",
    "traverse(tree4)\n",
    "tree4.draw()"
   ]
  }
 ]
}